Allstate Kaggle EDA: https://www.kaggle.com/c/allstate-claims-severity
Training set: 188318 rows and 131 variables (not counting the loss variable): id + 116 categorical + 14 continuous variables.
Test set: 125546 rows. The combined data set has 313864 rows.
Possible Insights: - All the continuous values are between 0 and 1
# Print a per-column summary of the training data frame `train`
# (level counts for the factor columns, quantiles and mean for the numeric ones).
summary(train)
## id cat1 cat2 cat3 cat4 cat5
## Min. : 1 A:141550 A:106721 A:177993 A:128395 A:123737
## 1st Qu.:147748 B: 46768 B: 81597 B: 10325 B: 59923 B: 64581
## Median :294540
## Mean :294136
## 3rd Qu.:440680
## Max. :587633
##
## cat6 cat7 cat8 cat9 cat10 cat11
## A:131693 A:183744 A:177274 A:113122 A:160213 A:168186
## B: 56625 B: 4574 B: 11044 B: 75196 B: 28105 B: 20132
##
##
##
##
##
## cat12 cat13 cat14 cat15 cat16 cat17
## A:159825 A:168851 A:186041 A:188284 A:181843 A:187009
## B: 28493 B: 19467 B: 2277 B: 34 B: 6475 B: 1309
##
##
##
##
##
## cat18 cat19 cat20 cat21 cat22 cat23
## A:187331 A:186510 A:188114 A:187905 A:188275 A:157445
## B: 987 B: 1808 B: 204 B: 413 B: 43 B: 30873
##
##
##
##
##
## cat24 cat25 cat26 cat27 cat28 cat29
## A:181977 A:169969 A:177119 A:168250 A:180938 A:184593
## B: 6341 B: 18349 B: 11199 B: 20068 B: 7380 B: 3725
##
##
##
##
##
## cat30 cat31 cat32 cat33 cat34 cat35
## A:184760 A:182980 A:187107 A:187361 A:187734 A:188105
## B: 3558 B: 5338 B: 1211 B: 957 B: 584 B: 213
##
##
##
##
##
## cat36 cat37 cat38 cat39 cat40 cat41
## A:156313 A:165729 A:169323 A:183393 A:180119 A:181177
## B: 32005 B: 22589 B: 18995 B: 4925 B: 8199 B: 7141
##
##
##
##
##
## cat42 cat43 cat44 cat45 cat46 cat47
## A:186623 A:184110 A:172716 A:183991 A:187436 A:187617
## B: 1695 B: 4208 B: 15602 B: 4327 B: 882 B: 701
##
##
##
##
##
## cat48 cat49 cat50 cat51 cat52 cat53
## A:188049 A:179127 A:137611 A:187071 A:179505 A:172949
## B: 269 B: 9191 B: 50707 B: 1247 B: 8813 B: 15369
##
##
##
##
##
## cat54 cat55 cat56 cat57 cat58 cat59
## A:183762 A:188173 A:188136 A:185296 A:188079 A:188018
## B: 4556 B: 145 B: 182 B: 3022 B: 239 B: 300
##
##
##
##
##
## cat60 cat61 cat62 cat63 cat64 cat65
## A:187872 A:187596 A:188273 A:188239 A:188271 A:186056
## B: 446 B: 722 B: 45 B: 79 B: 47 B: 2262
##
##
##
##
##
## cat66 cat67 cat68 cat69 cat70 cat71
## A:179982 A:187626 A:188176 A:188011 A:188295 A:178646
## B: 8336 B: 692 B: 142 B: 307 B: 23 B: 9672
##
##
##
##
##
## cat72 cat73 cat74 cat75 cat76 cat77
## A:118322 A:154275 A:184731 A:154307 A:181347 A: 49
## B: 69996 B: 34017 B: 3561 B: 34010 B: 6183 B: 358
## C: 26 C: 26 C: 1 C: 788 C: 408
## D:187503
##
##
##
## cat78 cat79 cat80 cat81 cat82 cat83
## A: 788 A: 7064 A: 783 A: 788 A: 19322 A: 26038
## B:186526 B:152929 B: 46538 B: 24132 B:147536 B:141534
## C: 645 C: 1668 C: 3492 C: 9013 C: 2655 C: 4958
## D: 359 D: 26657 D:137505 D:154385 D: 18805 D: 15788
##
##
##
## cat84 cat85 cat86 cat87 cat88 cat89
## A: 29450 A: 788 A: 1589 A: 788 A:168926 A :183744
## B: 431 B:186005 B:103852 B:166992 B: 7 B : 4312
## C:154939 C: 1011 C: 10290 C: 8819 D: 19302 C : 220
## D: 3498 D: 514 D: 72587 D: 11719 E: 83 D : 33
## E : 5
## I : 2
## (Other): 2
## cat90 cat91 cat92 cat93 cat94 cat95
## A:177993 A :111028 A:124689 A: 432 A: 738 A: 3736
## B: 9515 B : 42630 B: 628 B: 1133 B: 51710 B: 109
## C: 728 G : 26734 C: 62 C: 35788 C: 13623 C:87531
## D: 70 C : 6400 D: 11 D:150237 D:121642 D:79525
## E: 6 D : 1149 F: 1 E: 728 E: 91 E:17417
## F: 4 E : 254 H: 62901 F: 494
## G: 2 (Other): 123 I: 26 G: 20
## cat96 cat97 cat98 cat99 cat100
## E :174360 A:41970 A:105492 P :79455 F :42970
## D : 7922 B: 34 B: 542 T :72591 I :39933
## B : 2957 C:78127 C: 21485 R :10290 L :19961
## G : 2665 D: 3779 D: 50557 D : 8844 K :13817
## F : 343 E:47450 E: 10242 S : 7045 G :12935
## A : 35 F: 213 N : 2894 J :12027
## (Other): 36 G:16745 (Other): 7199 (Other):46675
## cat101 cat102 cat103 cat104
## A :106721 A :177274 A :123737 E :42925
## D : 17171 B : 5155 B : 33342 G :40660
## C : 16971 C : 4929 C : 16508 D :27611
## G : 10944 E : 482 D : 7806 F :19228
## F : 10139 D : 449 E : 4473 H :17187
## J : 7259 G : 15 F : 1528 K :14297
## (Other): 19113 (Other): 14 (Other): 924 (Other):26410
## cat105 cat106 cat107 cat108
## E :76493 G :47165 F :47310 B :65512
## F :62892 H :37713 G :28560 K :42435
## G :20613 F :36143 H :23461 G :21421
## D :12172 I :21433 J :22405 D :19160
## H :11258 J :18281 K :20236 F :10242
## I : 2941 E :13000 I :20066 A : 9299
## (Other): 1949 (Other):14583 (Other):26280 (Other):20249
## cat109 cat110 cat111 cat112
## BI :152918 CL :25305 A :128395 E :25148
## AB : 21933 EG :24654 C : 32401 AH :18639
## BU : 3142 CS :24592 E : 14682 AS :17669
## K : 2999 EB :21396 G : 7039 J :16222
## G : 1353 CO :17495 I : 3578 AF : 9368
## BQ : 1067 BT :16365 K : 1353 AN : 9138
## (Other): 4906 (Other):58511 (Other): 870 (Other):92134
## cat113 cat114 cat115 cat116
## BM :26191 A :131693 K :43866 HK : 21061
## AE :22030 C : 16793 O :26813 DJ : 20244
## L :13058 E : 16475 J :23895 CK : 10162
## AX :12661 J : 8199 N :22438 DP : 9202
## Y :11374 F : 7905 P :21538 GS : 8736
## K : 7738 N : 2455 L :16125 CR : 6862
## (Other):95266 (Other): 4798 (Other):33643 (Other):112051
## cont1 cont2 cont3 cont4
## Min. :0.000016 Min. :0.001149 Min. :0.002634 Min. :0.1769
## 1st Qu.:0.346090 1st Qu.:0.358319 1st Qu.:0.336963 1st Qu.:0.3274
## Median :0.475784 Median :0.555782 Median :0.527991 Median :0.4529
## Mean :0.493861 Mean :0.507188 Mean :0.498918 Mean :0.4918
## 3rd Qu.:0.623912 3rd Qu.:0.681761 3rd Qu.:0.634224 3rd Qu.:0.6521
## Max. :0.984975 Max. :0.862654 Max. :0.944251 Max. :0.9543
##
## cont5 cont6 cont7 cont8
## Min. :0.2811 Min. :0.01268 Min. :0.0695 Min. :0.2369
## 1st Qu.:0.2811 1st Qu.:0.33610 1st Qu.:0.3502 1st Qu.:0.3128
## Median :0.4223 Median :0.44094 Median :0.4383 Median :0.4411
## Mean :0.4874 Mean :0.49094 Mean :0.4850 Mean :0.4864
## 3rd Qu.:0.6433 3rd Qu.:0.65502 3rd Qu.:0.5910 3rd Qu.:0.6236
## Max. :0.9837 Max. :0.99716 Max. :1.0000 Max. :0.9802
##
## cont9 cont10 cont11 cont12
## Min. :0.00008 Min. :0.0000 Min. :0.03532 Min. :0.03623
## 1st Qu.:0.35897 1st Qu.:0.3646 1st Qu.:0.31096 1st Qu.:0.31166
## Median :0.44145 Median :0.4612 Median :0.45720 Median :0.46229
## Mean :0.48551 Mean :0.4981 Mean :0.49351 Mean :0.49315
## 3rd Qu.:0.56682 3rd Qu.:0.6146 3rd Qu.:0.67892 3rd Qu.:0.67576
## Max. :0.99540 Max. :0.9950 Max. :0.99874 Max. :0.99848
##
## cont13 cont14 loss
## Min. :0.000228 Min. :0.1797 Min. : 0.67
## 1st Qu.:0.315758 1st Qu.:0.2946 1st Qu.: 1204.46
## Median :0.363547 Median :0.4074 Median : 2115.57
## Mean :0.493138 Mean :0.4957 Mean : 3037.34
## 3rd Qu.:0.689974 3rd Qu.:0.7246 3rd Qu.: 3864.05
## Max. :0.988494 Max. :0.8448 Max. :121012.25
##
# Print the same per-column summary for the test data frame `test`;
# the output below has no `loss` column.
summary(test)
## id cat1 cat2 cat3 cat4 cat5
## Min. : 4 A:94096 A:71203 A:118752 A:86026 A:82282
## 1st Qu.:146414 B:31450 B:54343 B: 6794 B:39520 B:43264
## Median :294306
## Mean :294067
## 3rd Qu.:441800
## Max. :587634
##
## cat6 cat7 cat8 cat9 cat10 cat11
## A:88014 A:122546 A:118112 A:75509 A:106944 A:112470
## B:37532 B: 3000 B: 7434 B:50037 B: 18602 B: 13076
##
##
##
##
##
## cat12 cat13 cat14 cat15 cat16 cat17
## A:106777 A:112604 A:123954 A:125523 A:121262 A:124666
## B: 18769 B: 12942 B: 1592 B: 23 B: 4284 B: 880
##
##
##
##
##
## cat18 cat19 cat20 cat21 cat22 cat23
## A:124902 A:124376 A:125434 A:125291 A:125519 A:105272
## B: 644 B: 1170 B: 112 B: 255 B: 27 B: 20274
##
##
##
##
##
## cat24 cat25 cat26 cat27 cat28 cat29
## A:121430 A:113405 A:118077 A:112239 A:120751 A:123093
## B: 4116 B: 12141 B: 7469 B: 13307 B: 4795 B: 2453
##
##
##
##
##
## cat30 cat31 cat32 cat33 cat34 cat35
## A:123247 A:122061 A:124723 A:124914 A:125184 A:125417
## B: 2299 B: 3485 B: 823 B: 632 B: 362 B: 129
##
##
##
##
##
## cat36 cat37 cat38 cat39 cat40 cat41
## A:104035 A:110512 A:112774 A:122170 A:120081 A:120840
## B: 21511 B: 15034 B: 12772 B: 3376 B: 5465 B: 4706
##
##
##
##
##
## cat42 cat43 cat44 cat45 cat46 cat47
## A:124343 A:122811 A:114985 A:122647 A:124972 A:125055
## B: 1203 B: 2735 B: 10561 B: 2899 B: 574 B: 491
##
##
##
##
##
## cat48 cat49 cat50 cat51 cat52 cat53
## A:125366 A:119495 A:91888 A:124761 A:119761 A:115388
## B: 180 B: 6051 B:33658 B: 785 B: 5785 B: 10158
##
##
##
##
##
## cat54 cat55 cat56 cat57 cat58 cat59
## A:122577 A:125449 A:125435 A:123560 A:125393 A:125340
## B: 2969 B: 97 B: 111 B: 1986 B: 153 B: 206
##
##
##
##
##
## cat60 cat61 cat62 cat63 cat64 cat65
## A:125289 A:125024 A:125525 A:125501 A:125524 A:124021
## B: 257 B: 522 B: 21 B: 45 B: 22 B: 1525
##
##
##
##
##
## cat66 cat67 cat68 cat69 cat70 cat71
## A:119930 A:125112 A:125437 A:125351 A:125526 A:119187
## B: 5616 B: 434 B: 109 B: 195 B: 20 B: 6359
##
##
##
##
##
## cat72 cat73 cat74 cat75 cat76 cat77
## A:79486 A:102595 A:123055 A:102828 A:120866 A: 34
## B:46060 B: 22928 B: 2468 B: 22716 B: 4125 B: 264
## C: 23 C: 23 C: 2 C: 555 C: 272
## D:124976
##
##
##
## cat78 cat79 cat80 cat81 cat82 cat83 cat84
## A: 558 A: 4677 A: 552 A: 558 A:13026 A:17389 A: 19802
## B:124325 B:102007 B:30736 B: 15829 B:98004 B:94109 B: 283
## C: 445 C: 1168 C: 2419 C: 6245 C: 1735 C: 3452 C:103199
## D: 218 D: 17694 D:91839 D:102914 D:12781 D:10596 D: 2262
##
##
##
## cat85 cat86 cat87 cat88 cat89 cat90
## A: 558 A: 1126 A: 558 A:112427 A :122546 A:118752
## B:123963 B:68647 B:111306 B: 12 B : 2832 B: 6277
## C: 699 C: 6755 C: 5874 D: 13037 C : 141 C: 467
## D: 326 D:49018 D: 7808 E: 70 D : 20 D: 42
## E : 2 E: 5
## F : 2 F: 3
## (Other): 3
## cat91 cat92 cat93 cat94 cat95
## A :73787 A :83053 A: 308 A: 487 A: 2470
## B :28501 H :41969 B: 782 B:34558 B: 63
## G :17857 B : 448 C: 23948 C: 9190 C:58836
## C : 4369 C : 48 D:100031 D:80907 D:52693
## D : 785 I : 23 E: 477 E: 59 E:11484
## E : 148 D : 2 F: 333
## (Other): 99 (Other): 3 G: 12
## cat96 cat97 cat98 cat99 cat100
## E :116162 A:28475 A:69800 P :52645 F :28847
## D : 5414 B: 24 B: 382 T :49028 I :26686
## B : 1901 C:51813 C:14509 R : 6755 L :13277
## G : 1803 D: 2520 D:34126 D : 5739 K : 9035
## F : 229 E:31665 E: 6729 S : 4669 G : 8639
## A : 19 F: 133 N : 1877 J : 7969
## (Other): 18 G:10916 (Other): 4833 (Other):31093
## cat101 cat102 cat103 cat104
## A :71203 A:118112 A :82282 E :28632
## D :11648 B: 3505 B :22152 G :26966
## C :11244 C: 3305 C :11284 D :18250
## G : 7239 D: 291 D : 5180 F :12841
## F : 6882 E: 310 E : 3062 H :11275
## J : 4833 F: 7 F : 978 K : 9934
## (Other):12497 G: 16 (Other): 608 (Other):17648
## cat105 cat106 cat107 cat108
## E :50984 G :31015 F :31553 B :43219
## F :41920 H :24884 G :18665 K :28723
## G :13682 F :24377 H :15683 G :14478
## D : 8173 I :14441 J :15025 D :12788
## H : 7505 J :12092 K :13672 F : 6729
## I : 1926 E : 8871 I :13343 A : 6118
## (Other): 1356 (Other): 9866 (Other):17605 (Other):13491
## cat109 cat110 cat111 cat112
## BI :102134 CL :16792 A :86026 E :16961
## AB : 14356 CS :16404 C :21323 AH :12510
## BU : 2174 EG :16231 E : 9793 AS :11641
## K : 1957 EB :14569 G : 4560 J :10852
## G : 911 CO :11721 I : 2473 AN : 6182
## BQ : 721 BT :10902 K : 824 AF : 6130
## (Other): 3293 (Other):38927 (Other): 547 (Other):61270
## cat113 cat114 cat115 cat116
## BM :17565 A :88014 K :29161 HK :14015
## AE :14885 E :10994 O :17933 DJ :13730
## L : 8780 C :10953 J :15731 CK : 6685
## AX : 8393 F : 5534 N :14984 DP : 6185
## Y : 7772 J : 5371 P :14830 GS : 5913
## K : 5047 N : 1642 L :10518 CR : 4549
## (Other):63104 (Other): 3038 (Other):22389 (Other):74469
## cont1 cont2 cont3 cont4
## Min. :0.000016 Min. :0.001149 Min. :0.002634 Min. :0.1769
## 1st Qu.:0.347403 1st Qu.:0.358319 1st Qu.:0.336963 1st Qu.:0.3274
## Median :0.475784 Median :0.555782 Median :0.527991 Median :0.4529
## Mean :0.494447 Mean :0.506939 Mean :0.498255 Mean :0.4923
## 3rd Qu.:0.626630 3rd Qu.:0.681761 3rd Qu.:0.634224 3rd Qu.:0.6521
## Max. :0.984975 Max. :0.862654 Max. :0.944251 Max. :0.9560
##
## cont5 cont6 cont7 cont8
## Min. :0.2811 Min. :0.01268 Min. :0.0695 Min. :0.2369
## 1st Qu.:0.2811 1st Qu.:0.33610 1st Qu.:0.3521 1st Qu.:0.3180
## Median :0.4223 Median :0.44153 Median :0.4389 Median :0.4411
## Mean :0.4876 Mean :0.49219 Mean :0.4859 Mean :0.4874
## 3rd Qu.:0.6433 3rd Qu.:0.65926 3rd Qu.:0.5913 3rd Qu.:0.6292
## Max. :0.9831 Max. :0.99716 Max. :1.0000 Max. :0.9828
##
## cont9 cont10 cont11 cont12
## Min. :0.00008 Min. :0.0000 Min. :0.03532 Min. :0.03623
## 1st Qu.:0.35897 1st Qu.:0.3646 1st Qu.:0.31096 1st Qu.:0.31825
## Median :0.44145 Median :0.4667 Median :0.45720 Median :0.46229
## Mean :0.48602 Mean :0.4989 Mean :0.49436 Mean :0.49403
## 3rd Qu.:0.56889 3rd Qu.:0.6198 3rd Qu.:0.67892 3rd Qu.:0.68241
## Max. :0.99540 Max. :0.9950 Max. :0.99783 Max. :0.99742
##
## cont13 cont14
## Min. :0.000228 Min. :0.1786
## 1st Qu.:0.315758 1st Qu.:0.2948
## Median :0.363547 Median :0.4061
## Mean :0.495086 Mean :0.4956
## 3rd Qu.:0.689974 3rd Qu.:0.7248
## Max. :0.988494 Max. :0.8448
##
## [1] "id" "cat1" "cat2" "cat3" "cat4" "cat5" "cat6"
## [8] "cat7" "cat8" "cat9" "cat10" "cat11" "cat12" "cat13"
## [15] "cat14" "cat15" "cat16" "cat17" "cat18" "cat19" "cat20"
## [22] "cat21" "cat22" "cat23" "cat24" "cat25" "cat26" "cat27"
## [29] "cat28" "cat29" "cat30" "cat31" "cat32" "cat33" "cat34"
## [36] "cat35" "cat36" "cat37" "cat38" "cat39" "cat40" "cat41"
## [43] "cat42" "cat43" "cat44" "cat45" "cat46" "cat47" "cat48"
## [50] "cat49" "cat50" "cat51" "cat52" "cat53" "cat54" "cat55"
## [57] "cat56" "cat57" "cat58" "cat59" "cat60" "cat61" "cat62"
## [64] "cat63" "cat64" "cat65" "cat66" "cat67" "cat68" "cat69"
## [71] "cat70" "cat71" "cat72" "cat73" "cat74" "cat75" "cat76"
## [78] "cat77" "cat78" "cat79" "cat80" "cat81" "cat82" "cat83"
## [85] "cat84" "cat85" "cat86" "cat87" "cat88" "cat89" "cat90"
## [92] "cat91" "cat92" "cat93" "cat94" "cat95" "cat96" "cat97"
## [99] "cat98" "cat99" "cat100" "cat101" "cat102" "cat103" "cat104"
## [106] "cat105" "cat106" "cat107" "cat108" "cat109" "cat110" "cat111"
## [113] "cat112" "cat113" "cat114" "cat115" "cat116" "cont1" "cont2"
## [120] "cont3" "cont4" "cont5" "cont6" "cont7" "cont8" "cont9"
## [127] "cont10" "cont11" "cont12" "cont13" "cont14" "loss"
# Number of training rows for each level of cat116, most frequent level first.
# Only cat116 is needed for a count, so the unused `loss` column is not
# carried through the pipeline.
cat116_analysis <- train %>%
  group_by(cat116) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Display the per-level counts with datatable().
datatable(cat116_analysis)
# Average loss within each level of cat116, ordered from the highest mean
# to the lowest.
cat116_analysis2 <- train %>%
  group_by(cat116) %>%
  summarise(mean = mean(loss)) %>%
  arrange(desc(mean))

# Display the per-level means with datatable().
datatable(cat116_analysis2)
## 51 Levels
# Frequency of each level of cat112 in the training set.
table(train$cat112)
##
## A AA AB AC AD AE AF AG AH AI AJ AK
## 2411 1241 246 454 1531 834 9368 1331 18639 4749 144 6726
## AL AM AN AO AP AQ AR AS AT AU AV AW
## 1130 1170 9138 534 4000 30 2365 17669 1272 434 7122 3145
## AX AY B BA C D E F G H I J
## 1074 1414 423 190 2257 1645 25148 3149 3168 548 940 16222
## K L M N O P Q R S T U V
## 6059 493 439 8453 2183 406 793 1123 4201 521 8356 693
## W X Y
## 461 925 1351
# Pearson correlation matrix of the continuous predictors and the target.
# Columns are selected by name (every column starting with "cont", then
# "loss") instead of the positional range 118:132, so the selection does not
# silently shift if columns are added, dropped or reordered upstream.
M <- cor(
  train[, c(grep("^cont", names(train), value = TRUE), "loss")],
  method = "pearson"
)

# Plot the matrix with circles in the upper triangle and the variables
# ordered via the "hclust" option.
corrplot.mixed(M, upper = "circle", order = "hclust")
# Build a caret pre-processing object using the "nzv" method on the training
# data. The method name is written with plain ASCII double quotes;
# typographic quotes are not valid R string delimiters.
preProc <- preProcess(train, method = "nzv")

# Print the pre-processing object (separate statement on its own line).
preProc